%matplotlib notebook
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.dates as md
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
import pandas_profiling
#folder where files are stored, it will be changed for blob storage
FOLDER_PATH = '.\InterfaceTraffic\\'
#list all files
files = os.listdir(FOLDER_PATH)
#loading first file to get column names set
df = pd.read_csv(FOLDER_PATH + files[0])
original_columns = df.columns
display(df.head())
#load the rest of files
df = df.append(pd.concat(
[pd.read_csv(FOLDER_PATH+f, names=original_columns, header=0) for f in files[1:]], ignore_index=True))
| NodeID | InterfaceID | DateTime | In_Averagebps | In_Minbps | In_Maxbps | In_TotalBytes | In_TotalPkts | In_AvgUniCastPkts | In_MinUniCastPkts | ... | Out_Maxbps | Out_TotalBytes | Out_TotalPkts | Out_AvgUniCastPkts | Out_MaxUniCastPkts | Out_MinUniCastPkts | Out_AvgMultiCastPkts | Out_MinMultiCastPkts | Out_MaxMultiCastPkts | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3071 | 10912 | 11/14/2018 0:03 | 33568.895 | 33568.895 | 33568.895 | 2275179 | 6602 | 12.176065 | 12.176065 | ... | 33775.219 | 2289163 | 8177 | 15.080837 | 15.080837 | 15.080837 | 0 | 0 | 0 | 0 |
| 1 | 3071 | 10912 | 11/14/2018 0:12 | 23912.543 | 23912.543 | 23912.543 | 1614121 | 4978 | 9.218379 | 9.218379 | ... | 25298.492 | 1707674 | 6625 | 12.268333 | 12.268333 | 12.268333 | 0 | 0 | 0 | 0 |
| 2 | 3071 | 10912 | 11/14/2018 0:21 | 24765.508 | 24765.508 | 24765.508 | 1671697 | 5049 | 9.349859 | 9.349859 | ... | 26966.318 | 1820254 | 6656 | 12.325740 | 12.325740 | 12.325740 | 0 | 0 | 0 | 0 |
| 3 | 3071 | 10912 | 11/14/2018 0:30 | 36084.949 | 36084.949 | 36084.949 | 2435771 | 5414 | 10.025774 | 10.025774 | ... | 26080.227 | 1760442 | 6726 | 12.455367 | 12.455367 | 12.455367 | 0 | 0 | 0 | 0 |
| 4 | 3071 | 10912 | 11/14/2018 0:39 | 23879.447 | 23879.447 | 23879.447 | 1611887 | 4751 | 8.798016 | 8.798016 | ... | 24752.441 | 1670815 | 6205 | 11.490567 | 11.490567 | 11.490567 | 0 | 0 | 0 | 0 |
5 rows × 26 columns
#check types
print(df.info())
<class 'pandas.core.frame.DataFrame'> Int64Index: 1150 entries, 0 to 1122 Data columns (total 26 columns): NodeID 1145 non-null object InterfaceID 1140 non-null float64 DateTime 1140 non-null object In_Averagebps 1138 non-null float64 In_Minbps 1138 non-null float64 In_Maxbps 1138 non-null float64 In_TotalBytes 1138 non-null float64 In_TotalPkts 1140 non-null float64 In_AvgUniCastPkts 1140 non-null float64 In_MinUniCastPkts 1140 non-null float64 In_MaxUniCastPkts 1140 non-null float64 In_AvgMultiCastPkts 1140 non-null float64 In_MinMultiCastPkts 1140 non-null float64 In_MaxMultiCastPkts 1140 non-null float64 Out_Averagebps 1139 non-null float64 Out_Minbps 1139 non-null float64 Out_Maxbps 1139 non-null float64 Out_TotalBytes 1140 non-null float64 Out_TotalPkts 1140 non-null float64 Out_AvgUniCastPkts 1140 non-null float64 Out_MaxUniCastPkts 1140 non-null float64 Out_MinUniCastPkts 1140 non-null float64 Out_AvgMultiCastPkts 1140 non-null float64 Out_MinMultiCastPkts 1140 non-null float64 Out_MaxMultiCastPkts 1140 non-null float64 Label 1140 non-null float64 dtypes: float64(24), object(2) memory usage: 242.6+ KB None
#check nulls
display(df.isnull().sum(axis = 0))
# tail to check end of file
display(df.tail())
NodeID 5 InterfaceID 10 DateTime 10 In_Averagebps 12 In_Minbps 12 In_Maxbps 12 In_TotalBytes 12 In_TotalPkts 10 In_AvgUniCastPkts 10 In_MinUniCastPkts 10 In_MaxUniCastPkts 10 In_AvgMultiCastPkts 10 In_MinMultiCastPkts 10 In_MaxMultiCastPkts 10 Out_Averagebps 11 Out_Minbps 11 Out_Maxbps 11 Out_TotalBytes 10 Out_TotalPkts 10 Out_AvgUniCastPkts 10 Out_MaxUniCastPkts 10 Out_MinUniCastPkts 10 Out_AvgMultiCastPkts 10 Out_MinMultiCastPkts 10 Out_MaxMultiCastPkts 10 Label 10 dtype: int64
| NodeID | InterfaceID | DateTime | In_Averagebps | In_Minbps | In_Maxbps | In_TotalBytes | In_TotalPkts | In_AvgUniCastPkts | In_MinUniCastPkts | ... | Out_Maxbps | Out_TotalBytes | Out_TotalPkts | Out_AvgUniCastPkts | Out_MaxUniCastPkts | Out_MinUniCastPkts | Out_AvgMultiCastPkts | Out_MinMultiCastPkts | Out_MaxMultiCastPkts | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1118 | 3071 | 10912.0 | 11/21/2018 23:23 | 25717.123 | 25717.123 | 25717.123 | 1735932.0 | 5049.0 | 9.349859 | 9.349859 | ... | 28901.609 | 1950888.0 | 6831.0 | 12.649809 | 12.649809 | 12.649809 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1119 | 3071 | 10912.0 | 11/21/2018 23:32 | 36598.938 | 36598.938 | 36598.938 | 2470537.0 | 7494.0 | 13.877167 | 13.877167 | ... | 42309.723 | 2856032.0 | 9474.0 | 17.543673 | 17.543673 | 17.543673 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1120 | 3071 | 10912.0 | 11/21/2018 23:41 | 40754.551 | 40754.551 | 40754.551 | 2750894.0 | 5723.0 | 10.598295 | 10.598295 | ... | 28041.367 | 1892766.0 | 7273.0 | 13.468705 | 13.468705 | 13.468705 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1121 | 3071 | 10912.0 | 11/21/2018 23:50 | 31373.359 | 31373.359 | 31373.359 | 2117795.0 | 5893.0 | 10.912482 | 10.912482 | ... | 29864.063 | 2015913.0 | 7613.0 | 14.097528 | 14.097528 | 14.097528 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1122 | 3071 | 10912.0 | 11/21/2018 23:59 | 26111.961 | 26111.961 | 26111.961 | 1762533.0 | 5145.0 | 9.527909 | 9.527909 | ... | 27553.537 | 1859838.0 | 6897.0 | 12.772399 | 12.772399 | 12.772399 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 26 columns
#removing Nulls.
# in some files there was the Query note "(160 rows affected)" appended as a line
# which was bringing nulls
display(df.dropna(inplace=True))
#df = df[~df['In_AvgMultiCastPkts'].isin(['----','[A-Za-z]'])]
#forcing casting for IDs as strings
df.InterfaceID = df.InterfaceID.astype('str')
df.NodeID = df.NodeID.astype('str')
None
#set column 'DateTime' as datetime for time manipulation
df.DateTime = pd.to_datetime(df.DateTime)
#get description of variables
display(df.describe())
#check unique values
display(df.nunique())
| In_Averagebps | In_Minbps | In_Maxbps | In_TotalBytes | In_TotalPkts | In_AvgUniCastPkts | In_MinUniCastPkts | In_MaxUniCastPkts | In_AvgMultiCastPkts | In_MinMultiCastPkts | ... | Out_Maxbps | Out_TotalBytes | Out_TotalPkts | Out_AvgUniCastPkts | Out_MaxUniCastPkts | Out_MinUniCastPkts | Out_AvgMultiCastPkts | Out_MinMultiCastPkts | Out_MaxMultiCastPkts | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.137000e+03 | 1.137000e+03 | 1.137000e+03 | 1.137000e+03 | 1137.000000 | 1137.000000 | 1137.000000 | 1137.000000 | 1137.0 | 1137.0 | ... | 1.137000e+03 | 1.137000e+03 | 1137.000000 | 1137.000000 | 1137.000000 | 1137.000000 | 1137.0 | 1137.0 | 1137.0 | 1137.000000 |
| mean | 1.212335e+05 | 1.212335e+05 | 1.212335e+05 | 8.152142e+06 | 16285.093228 | 30.389581 | 30.389581 | 30.389581 | 0.0 | 0.0 | ... | 7.005924e+04 | 4.693381e+06 | 18697.883026 | 34.910087 | 34.910087 | 34.910087 | 0.0 | 0.0 | 0.0 | 0.007036 |
| std | 1.754134e+05 | 1.754134e+05 | 1.754134e+05 | 1.183763e+07 | 18885.520328 | 35.335937 | 35.335937 | 35.335937 | 0.0 | 0.0 | ... | 8.121224e+04 | 5.420891e+06 | 20369.515221 | 38.277495 | 38.277495 | 38.277495 | 0.0 | 0.0 | 0.0 | 0.083622 |
| min | 1.835818e+04 | 1.835818e+04 | 1.835818e+04 | 7.131600e+04 | 260.000000 | 7.538775 | 7.538775 | 7.538775 | 0.0 | 0.0 | ... | 1.940066e+04 | 5.498200e+04 | 306.000000 | 10.231919 | 10.231919 | 10.231919 | 0.0 | 0.0 | 0.0 | 0.000000 |
| 25% | 2.757859e+04 | 2.757859e+04 | 2.757859e+04 | 1.862305e+06 | 5264.000000 | 9.835027 | 9.835027 | 9.835027 | 0.0 | 0.0 | ... | 2.725935e+04 | 1.839981e+06 | 6848.000000 | 12.681291 | 12.681291 | 12.681291 | 0.0 | 0.0 | 0.0 | 0.000000 |
| 50% | 5.631495e+04 | 5.631495e+04 | 5.631495e+04 | 3.801317e+06 | 6912.000000 | 13.000000 | 13.000000 | 13.000000 | 0.0 | 0.0 | ... | 3.160374e+04 | 2.136669e+06 | 8222.000000 | 15.225093 | 15.225093 | 15.225093 | 0.0 | 0.0 | 0.0 | 0.000000 |
| 75% | 1.356932e+05 | 1.356932e+05 | 1.356932e+05 | 9.082668e+06 | 21030.000000 | 39.000000 | 39.000000 | 39.000000 | 0.0 | 0.0 | ... | 9.080104e+04 | 6.095726e+06 | 22393.000000 | 42.293781 | 42.293781 | 42.293781 | 0.0 | 0.0 | 0.0 | 0.000000 |
| max | 1.379918e+06 | 1.379918e+06 | 1.379918e+06 | 9.314859e+07 | 155819.000000 | 288.000000 | 288.000000 | 288.000000 | 0.0 | 0.0 | ... | 1.120072e+06 | 7.563884e+07 | 147945.000000 | 273.849240 | 273.849240 | 273.849240 | 0.0 | 0.0 | 0.0 | 1.000000 |
8 rows × 23 columns
NodeID 1 InterfaceID 1 DateTime 1137 In_Averagebps 1137 In_Minbps 1137 In_Maxbps 1137 In_TotalBytes 1131 In_TotalPkts 1070 In_AvgUniCastPkts 770 In_MinUniCastPkts 770 In_MaxUniCastPkts 770 In_AvgMultiCastPkts 1 In_MinMultiCastPkts 1 In_MaxMultiCastPkts 1 Out_Averagebps 1134 Out_Minbps 1134 Out_Maxbps 1134 Out_TotalBytes 1123 Out_TotalPkts 1045 Out_AvgUniCastPkts 1124 Out_MaxUniCastPkts 1124 Out_MinUniCastPkts 1124 Out_AvgMultiCastPkts 1 Out_MinMultiCastPkts 1 Out_MaxMultiCastPkts 1 Label 2 dtype: int64
# Names & Number of rows and columns
print('Number or rows:', df.shape[0])
print('Number or columns:', df.shape[1])
#print last 4 transposed rows
display(df.tail(4).T)
Number or rows: 1137 Number or columns: 26
| 1119 | 1120 | 1121 | 1122 | |
|---|---|---|---|---|
| NodeID | 3071 | 3071 | 3071 | 3071 |
| InterfaceID | 10912.0 | 10912.0 | 10912.0 | 10912.0 |
| DateTime | 2018-11-21 23:32:00 | 2018-11-21 23:41:00 | 2018-11-21 23:50:00 | 2018-11-21 23:59:00 |
| In_Averagebps | 36598.9 | 40754.6 | 31373.4 | 26112 |
| In_Minbps | 36598.9 | 40754.6 | 31373.4 | 26112 |
| In_Maxbps | 36598.9 | 40754.6 | 31373.4 | 26112 |
| In_TotalBytes | 2.47054e+06 | 2.75089e+06 | 2.1178e+06 | 1.76253e+06 |
| In_TotalPkts | 7494 | 5723 | 5893 | 5145 |
| In_AvgUniCastPkts | 13.8772 | 10.5983 | 10.9125 | 9.52791 |
| In_MinUniCastPkts | 13.8772 | 10.5983 | 10.9125 | 9.52791 |
| In_MaxUniCastPkts | 13.8772 | 10.5983 | 10.9125 | 9.52791 |
| In_AvgMultiCastPkts | 0 | 0 | 0 | 0 |
| In_MinMultiCastPkts | 0 | 0 | 0 | 0 |
| In_MaxMultiCastPkts | 0 | 0 | 0 | 0 |
| Out_Averagebps | 42309.7 | 28041.4 | 29864.1 | 27553.5 |
| Out_Minbps | 42309.7 | 28041.4 | 29864.1 | 27553.5 |
| Out_Maxbps | 42309.7 | 28041.4 | 29864.1 | 27553.5 |
| Out_TotalBytes | 2.85603e+06 | 1.89277e+06 | 2.01591e+06 | 1.85984e+06 |
| Out_TotalPkts | 9474 | 7273 | 7613 | 6897 |
| Out_AvgUniCastPkts | 17.5437 | 13.4687 | 14.0975 | 12.7724 |
| Out_MaxUniCastPkts | 17.5437 | 13.4687 | 14.0975 | 12.7724 |
| Out_MinUniCastPkts | 17.5437 | 13.4687 | 14.0975 | 12.7724 |
| Out_AvgMultiCastPkts | 0 | 0 | 0 | 0 |
| Out_MinMultiCastPkts | 0 | 0 | 0 | 0 |
| Out_MaxMultiCastPkts | 0 | 0 | 0 | 0 |
| Label | 0 | 0 | 0 | 0 |
#check columns
df.columns
Index(['NodeID', 'InterfaceID', 'DateTime', 'In_Averagebps', 'In_Minbps',
'In_Maxbps', 'In_TotalBytes', 'In_TotalPkts', 'In_AvgUniCastPkts',
'In_MinUniCastPkts', 'In_MaxUniCastPkts', 'In_AvgMultiCastPkts',
'In_MinMultiCastPkts', 'In_MaxMultiCastPkts', 'Out_Averagebps',
'Out_Minbps', 'Out_Maxbps', 'Out_TotalBytes', 'Out_TotalPkts',
'Out_AvgUniCastPkts', 'Out_MaxUniCastPkts', 'Out_MinUniCastPkts',
'Out_AvgMultiCastPkts', 'Out_MinMultiCastPkts', 'Out_MaxMultiCastPkts',
'Label'],
dtype='object')
Let's get a quick snapshot of the dataset with the help of pandas_profiling a module that gives much more information than pandas' describe(). It provides Quantile Statistics, Descriptive statistics, most frequent values, histograms, correlations and more.
print('Computing profiling report')
pfr = pandas_profiling.ProfileReport(df,correlation_threshold = 0.99999)
#pfr.to_file('ProfileReport.html')
display(pfr)
Computing profiling report
Dataset info
| Number of variables | 27 |
|---|---|
| Number of observations | 1137 |
| Total Missing (%) | 0.0% |
| Total size in memory | 239.9 KiB |
| Average record size in memory | 216.1 B |
Variables types
| Numeric | 9 |
|---|---|
| Categorical | 0 |
| Boolean | 1 |
| Date | 1 |
| Text (Unique) | 0 |
| Rejected | 16 |
| Unsupported | 0 |
Warnings
In_AvgMultiCastPkts has constant value 0 RejectedIn_MaxMultiCastPkts has constant value 0 RejectedIn_MaxUniCastPkts is highly correlated with In_MinUniCastPkts (ρ = 1) RejectedIn_Maxbps is highly correlated with In_Minbps (ρ = 1) RejectedIn_MinMultiCastPkts has constant value 0 RejectedIn_MinUniCastPkts is highly correlated with In_AvgUniCastPkts (ρ = 1) RejectedIn_Minbps is highly correlated with In_Averagebps (ρ = 1) RejectedInterfaceID has constant value 10912.0 RejectedNodeID has constant value 3071 RejectedOut_AvgMultiCastPkts has constant value 0 RejectedOut_MaxMultiCastPkts has constant value 0 RejectedOut_MaxUniCastPkts is highly correlated with Out_AvgUniCastPkts (ρ = 1) RejectedOut_Maxbps is highly correlated with Out_Minbps (ρ = 1) RejectedOut_MinMultiCastPkts has constant value 0 RejectedOut_MinUniCastPkts is highly correlated with Out_MaxUniCastPkts (ρ = 1) RejectedOut_Minbps is highly correlated with Out_Averagebps (ρ = 1) RejectedDateTime
Date
| Distinct count | 1137 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Minimum | 2018-11-14 00:03:00 |
|---|---|
| Maximum | 2018-11-21 23:59:00 |
In_Averagebps
Numeric
| Distinct count | 1137 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 121230 |
|---|---|
| Minimum | 18358 |
| Maximum | 1379900 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 18358 |
|---|---|
| 5-th percentile | 21737 |
| Q1 | 27579 |
| Median | 56315 |
| Q3 | 135690 |
| 95-th percentile | 424070 |
| Maximum | 1379900 |
| Range | 1361600 |
| Interquartile range | 108110 |
Descriptive statistics
| Standard deviation | 175410 |
|---|---|
| Coef of variation | 1.4469 |
| Kurtosis | 15.428 |
| Mean | 121230 |
| MAD | 109430 |
| Skewness | 3.5119 |
| Sum | 137840000 |
| Variance | 30770000000 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 89065.484 | 1 | 0.1% |
|
| 183185.31 | 1 | 0.1% |
|
| 553636.25 | 1 | 0.1% |
|
| 250184.13 | 1 | 0.1% |
|
| 33408.652 | 1 | 0.1% |
|
| 22358.441000000003 | 1 | 0.1% |
|
| 68914.406 | 1 | 0.1% |
|
| 172620.34 | 1 | 0.1% |
|
| 196422.31 | 1 | 0.1% |
|
| 22700.586 | 1 | 0.1% |
|
| Other values (1127) | 1127 | 99.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 18358.178 | 1 | 0.1% |
|
| 18576.387 | 1 | 0.1% |
|
| 18804.945 | 1 | 0.1% |
|
| 18965.92 | 1 | 0.1% |
|
| 18981.891 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1166574.5 | 1 | 0.1% |
|
| 1167917.0 | 1 | 0.1% |
|
| 1289838.5 | 1 | 0.1% |
|
| 1341904.0 | 1 | 0.1% |
|
| 1379918.4 | 1 | 0.1% |
|
In_AvgMultiCastPkts
Constant
This variable is constant and should be ignored for analysis
| Constant value | 0 |
|---|
In_AvgUniCastPkts
Numeric
| Distinct count | 770 |
|---|---|
| Unique (%) | 67.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 30.39 |
|---|---|
| Minimum | 7.5388 |
| Maximum | 288 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 7.5388 |
|---|---|
| 5-th percentile | 8.3687 |
| Q1 | 9.835 |
| Median | 13 |
| Q3 | 39 |
| 95-th percentile | 110.03 |
| Maximum | 288 |
| Range | 280.46 |
| Interquartile range | 29.165 |
Descriptive statistics
| Standard deviation | 35.336 |
|---|---|
| Coef of variation | 1.1628 |
| Kurtosis | 6.3002 |
| Mean | 30.39 |
| MAD | 25.845 |
| Skewness | 2.2911 |
| Sum | 34553 |
| Variance | 1248.6 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 9.0 | 69 | 6.1% |
|
| 12.0 | 39 | 3.4% |
|
| 10.0 | 32 | 2.8% |
|
| 13.0 | 27 | 2.4% |
|
| 11.0 | 21 | 1.8% |
|
| 14.0 | 20 | 1.8% |
|
| 15.0 | 19 | 1.7% |
|
| 16.0 | 13 | 1.1% |
|
| 8.0 | 13 | 1.1% |
|
| 18.0 | 9 | 0.8% |
|
| Other values (760) | 875 | 77.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 7.538775 | 1 | 0.1% |
|
| 7.6281042 | 1 | 0.1% |
|
| 7.8329883 | 1 | 0.1% |
|
| 7.8683295 | 1 | 0.1% |
|
| 7.8832145 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 189.0 | 1 | 0.1% |
|
| 191.0 | 1 | 0.1% |
|
| 196.0 | 1 | 0.1% |
|
| 241.0 | 1 | 0.1% |
|
| 288.0 | 1 | 0.1% |
|
In_MaxMultiCastPkts
Constant
This variable is constant and should be ignored for analysis
| Constant value | 0 |
|---|
In_MaxUniCastPkts
Highly correlated
This variable is highly correlated with In_MinUniCastPkts and should be ignored for analysis
| Correlation | 1 |
|---|
In_Maxbps
Highly correlated
This variable is highly correlated with In_Minbps and should be ignored for analysis
| Correlation | 1 |
|---|
In_MinMultiCastPkts
Constant
This variable is constant and should be ignored for analysis
| Constant value | 0 |
|---|
In_MinUniCastPkts
Highly correlated
This variable is highly correlated with In_AvgUniCastPkts and should be ignored for analysis
| Correlation | 1 |
|---|
In_Minbps
Highly correlated
This variable is highly correlated with In_Averagebps and should be ignored for analysis
| Correlation | 1 |
|---|
In_TotalBytes
Numeric
| Distinct count | 1131 |
|---|---|
| Unique (%) | 99.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 8152100 |
|---|---|
| Minimum | 71316 |
| Maximum | 93149000 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 71316 |
|---|---|
| 5-th percentile | 1465400 |
| Q1 | 1862300 |
| Median | 3801300 |
| Q3 | 9082700 |
| 95-th percentile | 28628000 |
| Maximum | 93149000 |
| Range | 93077000 |
| Interquartile range | 7220400 |
Descriptive statistics
| Standard deviation | 11838000 |
|---|---|
| Coef of variation | 1.4521 |
| Kurtosis | 15.539 |
| Mean | 8152100 |
| MAD | 7362600 |
| Skewness | 3.5273 |
| Sum | 9269000000 |
| Variance | 140130000000000 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 22800000.0 | 2 | 0.2% |
|
| 14100000.0 | 2 | 0.2% |
|
| 12500000.0 | 2 | 0.2% |
|
| 25200000.0 | 2 | 0.2% |
|
| 14900000.0 | 2 | 0.2% |
|
| 12200000.0 | 2 | 0.2% |
|
| 23614508.0 | 1 | 0.1% |
|
| 2435771.0 | 1 | 0.1% |
|
| 1598796.0 | 1 | 0.1% |
|
| 16100000.0 | 1 | 0.1% |
|
| Other values (1121) | 1121 | 98.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 71316.0 | 1 | 0.1% |
|
| 668795.0 | 1 | 0.1% |
|
| 959590.0 | 1 | 0.1% |
|
| 1239124.0 | 1 | 0.1% |
|
| 1253925.0 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 78747384.0 | 1 | 0.1% |
|
| 79592920.0 | 1 | 0.1% |
|
| 87055336.0 | 1 | 0.1% |
|
| 90663760.0 | 1 | 0.1% |
|
| 93148592.0 | 1 | 0.1% |
|
In_TotalPkts
Numeric
| Distinct count | 1070 |
|---|---|
| Unique (%) | 94.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 16285 |
|---|---|
| Minimum | 260 |
| Maximum | 155820 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 260 |
|---|---|
| 5-th percentile | 4535 |
| Q1 | 5264 |
| Median | 6912 |
| Q3 | 21030 |
| 95-th percentile | 59289 |
| Maximum | 155820 |
| Range | 155560 |
| Interquartile range | 15766 |
Descriptive statistics
| Standard deviation | 18886 |
|---|---|
| Coef of variation | 1.1597 |
| Kurtosis | 6.3427 |
| Mean | 16285 |
| MAD | 13823 |
| Skewness | 2.2889 |
| Sum | 18516000 |
| Variance | 356660000 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 4563.0 | 3 | 0.3% |
|
| 6127.0 | 3 | 0.3% |
|
| 5049.0 | 2 | 0.2% |
|
| 4785.0 | 2 | 0.2% |
|
| 7085.0 | 2 | 0.2% |
|
| 4966.0 | 2 | 0.2% |
|
| 6562.0 | 2 | 0.2% |
|
| 4828.0 | 2 | 0.2% |
|
| 4498.0 | 2 | 0.2% |
|
| 4668.0 | 2 | 0.2% |
|
| Other values (1060) | 1115 | 98.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 260.0 | 1 | 0.1% |
|
| 1926.0 | 1 | 0.1% |
|
| 4071.0 | 1 | 0.1% |
|
| 4119.0 | 1 | 0.1% |
|
| 4227.0 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 95247.0 | 1 | 0.1% |
|
| 101977.0 | 1 | 0.1% |
|
| 105836.0 | 1 | 0.1% |
|
| 130224.0 | 1 | 0.1% |
|
| 155819.0 | 1 | 0.1% |
|
InterfaceID
Constant
This variable is constant and should be ignored for analysis
| Constant value | 10912.0 |
|---|
Label
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.0070361 |
|---|
| 0.0 |
1129
|
|---|---|
| 1.0 |
|
| Value | Count | Frequency (%) | |
| 0.0 | 1129 | 99.3% |
|
| 1.0 | 8 | 0.7% |
|
NodeID
Constant
This variable is constant and should be ignored for analysis
| Constant value | 3071 |
|---|
Out_Averagebps
Numeric
| Distinct count | 1134 |
|---|---|
| Unique (%) | 99.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 70059 |
|---|---|
| Minimum | 19401 |
| Maximum | 1120100 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 19401 |
|---|---|
| 5-th percentile | 25171 |
| Q1 | 27259 |
| Median | 31604 |
| Q3 | 90801 |
| 95-th percentile | 218710 |
| Maximum | 1120100 |
| Range | 1100700 |
| Interquartile range | 63542 |
Descriptive statistics
| Standard deviation | 81212 |
|---|---|
| Coef of variation | 1.1592 |
| Kurtosis | 33.811 |
| Mean | 70059 |
| MAD | 55701 |
| Skewness | 4.1346 |
| Sum | 79657000 |
| Variance | 6595400000 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 25557.0 | 2 | 0.2% |
|
| 26367.0 | 2 | 0.2% |
|
| 25885.0 | 2 | 0.2% |
|
| 29178.371 | 1 | 0.1% |
|
| 27121.027000000002 | 1 | 0.1% |
|
| 48689.219000000005 | 1 | 0.1% |
|
| 32939.59 | 1 | 0.1% |
|
| 27715.347999999998 | 1 | 0.1% |
|
| 28066.699 | 1 | 0.1% |
|
| 26889.355 | 1 | 0.1% |
|
| Other values (1124) | 1124 | 98.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 19400.658 | 1 | 0.1% |
|
| 20615.105 | 1 | 0.1% |
|
| 21740.0 | 1 | 0.1% |
|
| 22386.443 | 1 | 0.1% |
|
| 22457.477000000003 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 460657.0 | 1 | 0.1% |
|
| 575972.0 | 1 | 0.1% |
|
| 585471.0 | 1 | 0.1% |
|
| 802127.25 | 1 | 0.1% |
|
| 1120072.0 | 1 | 0.1% |
|
Out_AvgMultiCastPkts
Constant
This variable is constant and should be ignored for analysis
| Constant value | 0 |
|---|
Out_AvgUniCastPkts
Numeric
| Distinct count | 1124 |
|---|---|
| Unique (%) | 98.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 34.91 |
|---|---|
| Minimum | 10.232 |
| Maximum | 273.85 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 10.232 |
|---|---|
| 5-th percentile | 11.47 |
| Q1 | 12.681 |
| Median | 15.225 |
| Q3 | 42.294 |
| 95-th percentile | 118.15 |
| Maximum | 273.85 |
| Range | 263.62 |
| Interquartile range | 29.612 |
Descriptive statistics
| Standard deviation | 38.277 |
|---|---|
| Coef of variation | 1.0965 |
| Kurtosis | 4.8846 |
| Mean | 34.91 |
| MAD | 28.612 |
| Skewness | 2.1198 |
| Sum | 39693 |
| Variance | 1465.2 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 14.738667000000001 | 2 | 0.2% |
|
| 14.751629000000001 | 2 | 0.2% |
|
| 16.146053 | 2 | 0.2% |
|
| 16.160868 | 2 | 0.2% |
|
| 12.288703 | 2 | 0.2% |
|
| 12.649809 | 2 | 0.2% |
|
| 11.419868 | 2 | 0.2% |
|
| 11.149832 | 2 | 0.2% |
|
| 14.210896 | 2 | 0.2% |
|
| 14.783538 | 2 | 0.2% |
|
| Other values (1114) | 1117 | 98.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 10.231919 | 1 | 0.1% |
|
| 10.672061 | 1 | 0.1% |
|
| 10.720208 | 1 | 0.1% |
|
| 10.72609 | 1 | 0.1% |
|
| 10.80354 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 195.70985 | 1 | 0.1% |
|
| 229.43204 | 1 | 0.1% |
|
| 238.38585 | 1 | 0.1% |
|
| 244.60652000000002 | 1 | 0.1% |
|
| 273.84924 | 1 | 0.1% |
|
Out_MaxMultiCastPkts
Constant
This variable is constant and should be ignored for analysis
| Constant value | 0 |
|---|
Out_MaxUniCastPkts
Highly correlated
This variable is highly correlated with Out_AvgUniCastPkts and should be ignored for analysis
| Correlation | 1 |
|---|
Out_Maxbps
Highly correlated
This variable is highly correlated with Out_Minbps and should be ignored for analysis
| Correlation | 1 |
|---|
Out_MinMultiCastPkts
Constant
This variable is constant and should be ignored for analysis
| Constant value | 0 |
|---|
Out_MinUniCastPkts
Highly correlated
This variable is highly correlated with Out_MaxUniCastPkts and should be ignored for analysis
| Correlation | 1 |
|---|
Out_Minbps
Highly correlated
This variable is highly correlated with Out_Averagebps and should be ignored for analysis
| Correlation | 1 |
|---|
Out_TotalBytes
Numeric
| Distinct count | 1123 |
|---|---|
| Unique (%) | 98.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4693400 |
|---|---|
| Minimum | 54982 |
| Maximum | 75639000 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 54982 |
|---|---|
| 5-th percentile | 1699100 |
| Q1 | 1840000 |
| Median | 2136700 |
| Q3 | 6095700 |
| 95-th percentile | 14748000 |
| Maximum | 75639000 |
| Range | 75584000 |
| Interquartile range | 4255700 |
Descriptive statistics
| Standard deviation | 5420900 |
|---|---|
| Coef of variation | 1.155 |
| Kurtosis | 35.124 |
| Mean | 4693400 |
| MAD | 3718700 |
| Skewness | 4.1887 |
| Sum | 5336400000 |
| Variance | 29386000000000 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 11100000.0 | 4 | 0.4% |
|
| 14900000.0 | 3 | 0.3% |
|
| 10000000.0 | 2 | 0.2% |
|
| 15600000.0 | 2 | 0.2% |
|
| 17000000.0 | 2 | 0.2% |
|
| 11600000.0 | 2 | 0.2% |
|
| 13800000.0 | 2 | 0.2% |
|
| 10400000.0 | 2 | 0.2% |
|
| 14700000.0 | 2 | 0.2% |
|
| 12200000.0 | 2 | 0.2% |
|
| Other values (1113) | 1114 | 98.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 54982.0 | 1 | 0.1% |
|
| 507456.0 | 1 | 0.1% |
|
| 1274024.0 | 1 | 0.1% |
|
| 1391460.0 | 1 | 0.1% |
|
| 1467459.0 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 30100000.0 | 1 | 0.1% |
|
| 38872152.0 | 1 | 0.1% |
|
| 39519888.0 | 1 | 0.1% |
|
| 54100000.0 | 1 | 0.1% |
|
| 75638840.0 | 1 | 0.1% |
|
Out_TotalPkts
Numeric
| Distinct count | 1045 |
|---|---|
| Unique (%) | 91.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 18698 |
|---|---|
| Minimum | 306 |
| Maximum | 147940 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 306 |
|---|---|
| 5-th percentile | 6193.8 |
| Q1 | 6848 |
| Median | 8222 |
| Q3 | 22393 |
| 95-th percentile | 63298 |
| Maximum | 147940 |
| Range | 147640 |
| Interquartile range | 15545 |
Descriptive statistics
| Standard deviation | 20370 |
|---|---|
| Coef of variation | 1.0894 |
| Kurtosis | 4.5435 |
| Mean | 18698 |
| MAD | 15274 |
| Skewness | 2.0756 |
| Sum | 21259000 |
| Variance | 414920000 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 6612.0 | 3 | 0.3% |
|
| 7573.0 | 3 | 0.3% |
|
| 6994.0 | 3 | 0.3% |
|
| 8617.0 | 2 | 0.2% |
|
| 7592.0 | 2 | 0.2% |
|
| 8703.0 | 2 | 0.2% |
|
| 7674.0 | 2 | 0.2% |
|
| 6591.0 | 2 | 0.2% |
|
| 6308.0 | 2 | 0.2% |
|
| 6412.0 | 2 | 0.2% |
|
| Other values (1035) | 1114 | 98.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 306.0 | 1 | 0.1% |
|
| 2084.0 | 1 | 0.1% |
|
| 5412.0 | 1 | 0.1% |
|
| 5525.0 | 1 | 0.1% |
|
| 5763.0 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 104010.0 | 1 | 0.1% |
|
| 106691.0 | 1 | 0.1% |
|
| 123888.0 | 1 | 0.1% |
|
| 128775.0 | 1 | 0.1% |
|
| 147945.0 | 1 | 0.1% |
|
index
Numeric
| Distinct count | 1110 |
|---|---|
| Unique (%) | 97.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 547.93 |
|---|---|
| Minimum | 0 |
| Maximum | 1122 |
| Zeros (%) | 0.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 29.8 |
| Q1 | 259 |
| Median | 547 |
| Q3 | 836 |
| 95-th percentile | 1064.2 |
| Maximum | 1122 |
| Range | 1122 |
| Interquartile range | 577 |
Descriptive statistics
| Standard deviation | 331.72 |
|---|---|
| Coef of variation | 0.60541 |
| Kurtosis | -1.2162 |
| Mean | 547.93 |
| MAD | 287.56 |
| Skewness | 0.010258 |
| Sum | 622993 |
| Variance | 110040 |
| Memory size | 9.0 KiB |
| Value | Count | Frequency (%) | |
| 0 | 2 | 0.2% |
|
| 13 | 2 | 0.2% |
|
| 26 | 2 | 0.2% |
|
| 25 | 2 | 0.2% |
|
| 24 | 2 | 0.2% |
|
| 23 | 2 | 0.2% |
|
| 22 | 2 | 0.2% |
|
| 21 | 2 | 0.2% |
|
| 19 | 2 | 0.2% |
|
| 18 | 2 | 0.2% |
|
| Other values (1100) | 1117 | 98.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 2 | 0.2% |
|
| 1 | 2 | 0.2% |
|
| 2 | 2 | 0.2% |
|
| 3 | 2 | 0.2% |
|
| 4 | 2 | 0.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1118 | 1 | 0.1% |
|
| 1119 | 1 | 0.1% |
|
| 1120 | 1 | 0.1% |
|
| 1121 | 1 | 0.1% |
|
| 1122 | 1 | 0.1% |
|
| NodeID | InterfaceID | DateTime | In_Averagebps | In_Minbps | In_Maxbps | In_TotalBytes | In_TotalPkts | In_AvgUniCastPkts | In_MinUniCastPkts | In_MaxUniCastPkts | In_AvgMultiCastPkts | In_MinMultiCastPkts | In_MaxMultiCastPkts | Out_Averagebps | Out_Minbps | Out_Maxbps | Out_TotalBytes | Out_TotalPkts | Out_AvgUniCastPkts | Out_MaxUniCastPkts | Out_MinUniCastPkts | Out_AvgMultiCastPkts | Out_MinMultiCastPkts | Out_MaxMultiCastPkts | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3071 | 10912.0 | 2018-11-14 00:03:00 | 33568.895 | 33568.895 | 33568.895 | 2275179.0 | 6602.0 | 12.176065 | 12.176065 | 12.176065 | 0.0 | 0.0 | 0.0 | 33775.219 | 33775.219 | 33775.219 | 2289163.0 | 8177.0 | 15.080837 | 15.080837 | 15.080837 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 3071 | 10912.0 | 2018-11-14 00:12:00 | 23912.543 | 23912.543 | 23912.543 | 1614121.0 | 4978.0 | 9.218379 | 9.218379 | 9.218379 | 0.0 | 0.0 | 0.0 | 25298.492 | 25298.492 | 25298.492 | 1707674.0 | 6625.0 | 12.268333 | 12.268333 | 12.268333 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 3071 | 10912.0 | 2018-11-14 00:21:00 | 24765.508 | 24765.508 | 24765.508 | 1671697.0 | 5049.0 | 9.349859 | 9.349859 | 9.349859 | 0.0 | 0.0 | 0.0 | 26966.318 | 26966.318 | 26966.318 | 1820254.0 | 6656.0 | 12.325740 | 12.325740 | 12.325740 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 3071 | 10912.0 | 2018-11-14 00:30:00 | 36084.949 | 36084.949 | 36084.949 | 2435771.0 | 5414.0 | 10.025774 | 10.025774 | 10.025774 | 0.0 | 0.0 | 0.0 | 26080.227 | 26080.227 | 26080.227 | 1760442.0 | 6726.0 | 12.455367 | 12.455367 | 12.455367 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 3071 | 10912.0 | 2018-11-14 00:39:00 | 23879.447 | 23879.447 | 23879.447 | 1611887.0 | 4751.0 | 8.798016 | 8.798016 | 8.798016 | 0.0 | 0.0 | 0.0 | 24752.441 | 24752.441 | 24752.441 | 1670815.0 | 6205.0 | 11.490567 | 11.490567 | 11.490567 | 0.0 | 0.0 | 0.0 | 0.0 |
#creating ID for future reference and mdoeling
df['NodeID_InterfaceID']=df.NodeID +'_'+ df.InterfaceID
#Removing Zero columns and old IDs
df.drop(columns=['NodeID','InterfaceID',
'Out_AvgMultiCastPkts','Out_MinMultiCastPkts','Out_MaxMultiCastPkts',
'In_AvgMultiCastPkts','In_MinMultiCastPkts','In_MaxMultiCastPkts'], inplace=True)
#creating time features, more can be added in the future
df['TotalMinutes'] = df.DateTime.dt.hour*60 + df.DateTime.dt.minute
df['Day'] = df.DateTime.dt.day
From the profiling report results lets remove some variables
We will only keep the Average variable for each group, the rest will be removed because of high correlation (ρ = 1):
In_MaxUniCastPkts is highly correlated with In_MinUniCastPkts (ρ = 1) Rejected
In_MinUniCastPkts is highly correlated with In_AvgUniCastPkts (ρ = 1) Rejected
Out_MinUniCastPkts is highly correlated with Out_MaxUniCastPkts (ρ = 1) Rejected
Out_MaxUniCastPkts is highly correlated with Out_AvgUniCastPkts (ρ = 1) Rejected
In_Maxbps is highly correlated with In_Minbps (ρ = 1) Rejected
In_Minbps is highly correlated with In_Averagebps (ρ = 1) Rejected
Out_Maxbps is highly correlated with Out_Minbps (ρ = 1) Rejected
Out_Minbps is highly correlated with Out_Averagebps (ρ = 1) Rejected
#create compact dataframe
columns = ['DateTime', 'TotalMinutes', 'Day',
'In_TotalPkts', 'Out_TotalPkts',
'In_AvgUniCastPkts', 'Out_AvgUniCastPkts',
'In_Averagebps', 'Out_Averagebps',
'In_TotalBytes' ,'Out_TotalBytes',
'Label']
df4model = df[columns]
df4model.head()
| DateTime | TotalMinutes | Day | In_TotalPkts | Out_TotalPkts | In_AvgUniCastPkts | Out_AvgUniCastPkts | In_Averagebps | Out_Averagebps | In_TotalBytes | Out_TotalBytes | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2018-11-14 00:03:00 | 3 | 14 | 6602.0 | 8177.0 | 12.176065 | 15.080837 | 33568.895 | 33775.219 | 2275179.0 | 2289163.0 | 0.0 |
| 1 | 2018-11-14 00:12:00 | 12 | 14 | 4978.0 | 6625.0 | 9.218379 | 12.268333 | 23912.543 | 25298.492 | 1614121.0 | 1707674.0 | 0.0 |
| 2 | 2018-11-14 00:21:00 | 21 | 14 | 5049.0 | 6656.0 | 9.349859 | 12.325740 | 24765.508 | 26966.318 | 1671697.0 | 1820254.0 | 0.0 |
| 3 | 2018-11-14 00:30:00 | 30 | 14 | 5414.0 | 6726.0 | 10.025774 | 12.455367 | 36084.949 | 26080.227 | 2435771.0 | 1760442.0 | 0.0 |
| 4 | 2018-11-14 00:39:00 | 39 | 14 | 4751.0 | 6205.0 | 8.798016 | 11.490567 | 23879.447 | 24752.441 | 1611887.0 | 1670815.0 | 0.0 |
#check columns to plot
str(columns[1:11])
"['TotalMinutes', 'Day', 'In_TotalPkts', 'Out_TotalPkts', 'In_AvgUniCastPkts', 'Out_AvgUniCastPkts', 'In_Averagebps', 'Out_Averagebps', 'In_TotalBytes', 'Out_TotalBytes']"
# Check distribution for variables
for column in columns[1:11]:
a = df4model.loc[df4model['Label'] == 0, column]
b = df4model.loc[df4model['Label'] == 1, column]
fig, axs = plt.subplots()
axs.hist([a,b], bins=32, stacked=True, color=['blue', 'red'], label=['normal', 'alert!'])
plt.legend()
plt.title(column)
plt.show()
# plot in time of labeled alerts and selected variables
x = df4model['DateTime']
for column in columns[3:8]:
fig, ax = plt.subplots()
y1 = df4model[column]
y2 = df4model['Label']*df4model[column].max() *0.5
ax.plot(x, y1, color='grey')
ax.plot(x, y2, color='red' )
plt.legend()
plt.show()
Due to the small dataset, We will only create 2 unsupervised models:
1) Isolation Forest
2) One class SVM
Novel algorithm that uses the concept of decision tree to find the 'isolated' values
# Take useful feature and standardize them
data = df4model[columns[1:11]]
min_max_scaler = preprocessing.StandardScaler()
np_scaled = min_max_scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# train isolation forest
model = IsolationForest(contamination = 0.01)
model.fit(data)
# add the data to the main
df4model['Anomaly'] = pd.Series(model.predict(data))
df4model['Anomaly'] = df4model['Anomaly'].map( {1: 0, -1: 1} )
print(df4model['Anomaly'].value_counts())
0 1125 1 12 Name: Anomaly, dtype: int64
x = df4model['DateTime']
for column in columns[1:11]:
fig, ax = plt.subplots()
y1 = df4model[column]
y3 = df4model['Anomaly']*df4model[column].max() *0.5
ax.plot(x, y1, color='grey')
ax.plot(x, y3, color='blue' )
plt.legend()
plt.show()
In this case the model gets used to normality and will trigger an alarm when gets something abnormal compared to its training set
# Take useful feature and standardize them
train = df4model[columns[1:]][df4model['Label']==0]
test = df4model[columns[1:]]
min_max_scaler = preprocessing.StandardScaler()
train_scaled = min_max_scaler.fit_transform(train)
train = pd.DataFrame(train_scaled)
test_scaled = min_max_scaler.fit_transform(test)
test = pd.DataFrame(test_scaled)
# train one class SVM
model = OneClassSVM(nu=0.001)
model.fit(train)
# add the data to the main
df4model['Anomaly2'] = pd.Series(model.predict(test))
df4model['Anomaly2'] = df4model['Anomaly2'].map( {1: 0, -1: 1} )
print(df4model['Anomaly2'].value_counts())
0 1112 1 25 Name: Anomaly2, dtype: int64
x = df4model['DateTime']
for column in columns[1:8]:
fig, ax = plt.subplots()
y1 = df4model[column]
y4 = df4model['Anomaly2']*df4model[column].max() *0.5
ax.plot(x, y1, color='grey')
ax.plot(x, y4, color='g')
plt.legend()
plt.show()
(This notebook is only for demo purposes)
- More Data its needed to get adequate results
- Set up two flows (supervised and unsupervised) to create more models
- Check data to create more features and find patterns